> project_summary = "/Users/rory/cache/yimlamai-hepatocytes/2016-08-29_yimlamai/project-summary.csv"
> counts_file = "/Users/rory/cache/yimlamai-hepatocytes/2016-08-29_yimlamai/combined.counts"
> tx2genes_file = "/Users/rory/cache/yimlamai-hepatocytes/2016-08-29_yimlamai/tx2gene.csv"
> cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2",
+ "#D55E00", "#CC79A7")
> summarydata = read.table(project_summary, header = TRUE, sep = ",")
> summarydata = summarydata[, colSums(is.na(summarydata)) < nrow(summarydata)]
> # handle newer bcbio-nextgen runs that use description as the key
> if ("description" %in% colnames(summarydata)) {
+ rownames(summarydata) = summarydata$description
+ summarydata$Name = rownames(summarydata)
+ summarydata$description = NULL
+ } else {
+ rownames(summarydata) = summarydata$Name
+ # summarydata$Name = NULL
+ }
> summarydata = summarydata[order(rownames(summarydata)), ]
> if (file.exists(tx2genes_file)) {
+ sample_dirs = file.path(dirname(project_summary), "..", rownames(summarydata))
+ salmon_files = file.path(sample_dirs, "salmon", "quant.sf")
+ sailfish_files = file.path(sample_dirs, "sailfish", "quant.sf")
+ new_sailfish = file.path(sample_dirs, "sailfish", "quant", "quant.sf")
+ new_salmon = file.path(sample_dirs, "salmon", "quant", "quant.sf")
+ if (file.exists(salmon_files[1])) {
+ sf_files = salmon_files
+ } else if (file.exists(sailfish_files[1])) {
+ sf_files = sailfish_files
+ } else if (file.exists(new_sailfish[1])) {
+ sf_files = new_sailfish
+ } else if (file.exists(new_salmon[1])) {
+ sf_files = new_salmon
+ }
+ names(sf_files) = rownames(summarydata)
+ tx2gene = read.table(tx2genes_file, sep = ",", row.names = NULL, header = FALSE)
+ txi.salmon = tximport(sf_files, type = "salmon", tx2gene = tx2gene, reader = readr::read_tsv,
+ countsFromAbundance = "lengthScaledTPM")
+ counts = round(data.frame(txi.salmon$counts, check.names = FALSE))
+ } else {
+ counts = read.table(counts_file, header = TRUE, row.names = "id", check.names = FALSE)
+ }
> counts = counts[, order(colnames(counts)), drop = FALSE]
> colnames(counts) = gsub(".counts", "", colnames(counts))
>
> # this is a list of all non user-supplied metadata columns that could appear
> known_columns = c("Name", "X.GC", "Exonic.Rate", "Sequences.flagged.as.poor.quality",
+ "rRNA_rate", "Fragment.Length.Mean", "Intronic.Rate", "Intergenic.Rate",
+ "Mapping.Rate", "Quality.format", "Duplication.Rate.of.Mapped", "Mapped",
+ "rRNA", "Sequence.length", "Transcripts.Detected", "Mean.Per.Base.Cov.",
+ "Genes.Detected", "Unique.Starts.Per.Read", "unique_starts_per_read", "complexity",
+ "X5.3.bias", "Duplicates.pct", "Duplicates", "Mapped.reads", "Average.insert.size",
+ "Mapped.reads.pct", "Total.reads", "avg_coverage_per_region", "Mapped.Reads")
> summarydata[, "Fragment.Length.Mean"] = summarydata$Average.insert.size
> summarydata$gt = paste(summarydata$genotype, summarydata$treatment, sep = "_")
> metadata = summarydata[, !colnames(summarydata) %in% known_columns, drop = FALSE]
> metadata = metadata[, colSums(is.na(metadata)) < nrow(metadata), drop = FALSE]
> sanitize_datatable = function(df, ...) {
+ # remove dashes which cause wrapping
+ DT::datatable(df, ..., rownames = gsub("-", "_", rownames(df)), colnames = gsub("-",
+ "_", colnames(df)))
+ }
> # set seed for reproducibility
> set.seed(1454944673)
> get_heatmap_fn = function(summarydata) {
+ # return the pheatmap function with or without metadata
+ if (ncol(metadata) == 0) {
+ return(pheatmap)
+ } else {
+ # rownames(metadata) = summarydata$Name
+ heatmap_fn = function(data, ...) {
+ pheatmap(data, annotation = metadata, clustering_method = "ward.D2",
+ clustering_distance_cols = "correlation", ...)
+ }
+ return(heatmap_fn)
+ }
+ }
> heatmap_fn = get_heatmap_fn(summarydata)
> qualimap_run = "Mapped" %in% colnames(summarydata)
> do_quality = "Mapped.reads" %in% colnames(summarydata)
Mapped reads looks great, there is some variation but that is to be expected.
> ggplot(summarydata, aes(x = Name, y = Mapped)) + theme_bw(base_size = 10) +
+ theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) +
+ geom_bar(stat = "identity") + ylab("mapped reads") + xlab("")
> ggplot(summarydata, aes(x = Name, y = Mapped.reads)) + theme_bw(base_size = 10) +
+ theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) +
+ geom_bar(stat = "identity") + ylab("mapped reads") + xlab("")
The genomic mapping rate looks excellent for all of the samples.
> ggplot(summarydata, aes(x = Name, y = Mapping.Rate)) + geom_bar(stat = "identity") +
+ ylab("mapping rate") + xlab("") + theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90))
> ggplot(summarydata, aes(x = Name, y = Mapped.reads.pct)) + geom_bar(stat = "identity") +
+ ylab("mapping rate") + xlab("") + theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90))
We see around the same number of genes detected in each sample, which is another good sign.
> dd = data.frame(Name = colnames(counts), Genes.Detected = colSums(counts > 0))
> ggplot(dd, aes(x = Name, y = Genes.Detected)) + geom_bar(stat = "identity") +
+ theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("genes detected") +
+ xlab("")
The exonic mapping rate is a measurement of how much RNA we actually sequenced; if the genomic mapping rate is super high and the exonic mapping rate is low, we have DNA contamination. There are a couple samples that look like they might have some DNA contamination, but nothing absolutely horrible.
> ggplot(summarydata, aes(x = Name, y = Exonic.Rate)) + geom_bar(stat = "identity") +
+ theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("exonic mapping rate") +
+ xlab("")
These rates are low for all samples, which is good.
> eval_rRNA = "rRNA_rate" %in% colnames(summarydata) & !sum(is.na(summarydata$rRNA_rate)) ==
+ nrow(summarydata)
> ggplot(summarydata, aes(x = Name, y = rRNA_rate)) + geom_bar(stat = "identity") +
+ theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("rRNA rate") +
+ xlab("")
This is round 1, which is good– if it deviates much from 1 then it is indicative of degradation of the RNA.
> ggplot(summarydata, aes(x = Name, y = X5.3.bias)) + geom_bar(stat = "identity") +
+ theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("5'->3' bias") +
+ xlab("")
The distributions of counts looks pretty similar across all of the samples.
> melted = melt(counts)
> colnames(melted) = c("sample", "count")
> melted$sample = factor(melted$sample)
> melted = melted[order(melted$sample), ]
> melted$count = log(melted$count)
> ggplot(melted, aes(x = sample, y = count)) + geom_boxplot() + theme_bw(base_size = 10) +
+ theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) +
+ xlab("")
Normalizing makes them much more similar, which is good. Trimmed mean of M-values (TMM) normalization is described here
Robinson, M. D., & Oshlack, A. (2010). A scaling normalization method for differential expression analysis of RNA-seq data. Genome Biology, 11(3). doi:10.1186/gb-2010-11-3-r25
> y = DGEList(counts = counts)
> y = calcNormFactors(y)
> normalized_counts = cpm(y, normalized.lib.sizes = TRUE)
> melted = melt(normalized_counts)
> colnames(melted) = c("gene", "sample", "count")
> melted$sample = factor(melted$sample)
> melted = melted[order(melted$sample), ]
> melted$count = log(melted$count)
> ggplot(melted, aes(x = sample, y = count)) + geom_boxplot() + theme_bw(base_size = 10) +
+ theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) +
+ xlab("")
Samples of the same genotype and treatment cluster together by Spearman correlation which is a good sign.
> heatmap_fn(cor(normalized_counts, method = "pearson"))
> heatmap_fn(cor(normalized_counts, method = "spearman"))
Samples separate very cleanly along the 1st and 2nd principal components by genotype and treatment. This is good news for differential expression analyses.
> dds = DESeqDataSetFromMatrix(countData = counts, colData = summarydata, design = ~Name)
> vst = varianceStabilizingTransformation(dds)
> pca_loadings = function(object, ntop = 500) {
+ rv <- matrixStats::rowVars(assay(object))
+ select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
+ pca <- prcomp(t(assay(object)[select, ]))
+ percentVar <- pca$sdev^2/sum(pca$sdev^2)
+ names(percentVar) = colnames(pca$x)
+ pca$percentVar = percentVar
+ return(pca)
+ }
> pc = pca_loadings(vst)
> comps = data.frame(pc$x)
> comps$Name = rownames(comps)
> library(dplyr)
> comps = comps %>% left_join(summarydata, by = c(Name = "Name"))
> colorby = "Name"
> pca_plot = function(comps, nc1, nc2, colorby) {
+ c1str = paste0("PC", nc1)
+ c2str = paste0("PC", nc2)
+ if (!(c1str %in% colnames(comps) && c2str %in% colnames(comps))) {
+ warning("Higher order components not found, skipping plotting.")
+ return(NA)
+ }
+ ggplot(comps, aes_string(c1str, c2str, color = colorby)) + geom_point() +
+ theme_bw() + xlab(paste0(c1str, ": ", round(pc$percentVar[nc1] * 100),
+ "% variance")) + ylab(paste0(c2str, ": ", round(pc$percentVar[nc2] *
+ 100), "% variance"))
+ }
> pca_plot(comps, 1, 2, "gt")
> pca_plot(comps, 3, 4, "gt")
> pca_plot(comps, 5, 6, "gt")
> ggplot(data.frame(component = reorder(names(pc$percentVar), -pc$percentVar),
+ percent_var = pc$percentVar), aes(component, percent_var)) + geom_bar(stat = "identity") +
+ ylab("percent of total variation") + xlab("") + theme_bw()
> # snagged from development version of DESeq
> DESeqDataSetFromTximport <- function(txi, colData, design, ...) {
+ counts <- round(txi$counts)
+ mode(counts) <- "integer"
+ dds <- DESeqDataSetFromMatrix(countData = counts, colData = colData, design = design,
+ ...)
+ stopifnot(txi$countsFromAbundance %in% c("no", "scaledTPM", "lengthScaledTPM"))
+ if (txi$countsFromAbundance %in% c("scaledTPM", "lengthScaledTPM")) {
+ message("using length scaled TPM counts from tximport")
+ } else {
+ message("using counts and average transcript lengths from tximport")
+ lengths <- txi$length
+ dimnames(lengths) <- dimnames(dds)
+ assays(dds)[["avgTxLength"]] <- lengths
+ }
+ return(dds)
+ }
>
> subset_tximport = function(txi, rows, columns) {
+ txi$counts = txi$counts[rows, columns]
+ txi$abundance = txi$abundance[rows, columns]
+ txi$length = txi$length[rows, columns]
+ return(txi)
+ }
> deseq2resids = function(dds) {
+ # calculate residuals for a deseq2 fit
+ fitted = t(t(assays(dds)[["mu"]])/sizeFactors(dds))
+ return(counts(dds, normalized = TRUE) - fitted)
+ }
> library(DEGreport)
> library(vsn)
> design = ~genotype + treatment + genotype:treatment
> summarydata$treatment = relevel(summarydata$treatment, "untreated")
> summarydata$genotype = relevel(summarydata$genotype, "wt")
Here we fit a model that looks like ~genotype + treatment + genotype:treatment. The coefficient of genotype will be the effect due to it being a knockout, when there has been no treatment. The coefficient of treatment will be the effect of effect of treatment on the control cells. The coefficient of genotype:treatment will be the effect of treatment on the knockout cells.
> counts <- counts[rowSums(counts > 0) > 1, ]
> if (exists("txi.salmon")) {
+ txi.salmon = subset_tximport(txi.salmon, rownames(counts), colnames(counts))
+ dds = DESeqDataSetFromTximport(txi.salmon, colData = summarydata, design = design)
+ } else {
+ dds = DESeqDataSetFromMatrix(countData = counts, colData = summarydata,
+ design = design)
+ }
> geoMeans = apply(counts, 1, function(row) if (all(row == 0)) 0 else exp(mean(log(row[row !=
+ 0]))))
> dds = estimateSizeFactors(dds, geoMeans = geoMeans)
> dds = DESeq(dds)
These dispersion estimates look normal– the red line is the fitted value for the mean-variance relationship, the black dots are the gene-wise dispersion values and the blue dots are the gene-wise dispersions shrunk back to the fitted line.
> plotDispEsts(dds)
> library(biomaRt)
> biomart_dataset = "mmusculus_gene_ensembl"
> mart = biomaRt::useMart(biomart = "ensembl", dataset = biomart_dataset)
> symbols = biomaRt::getBM(attributes = c("ensembl_gene_id", "mgi_symbol"), mart = mart)
> entrez = biomaRt::getBM(attributes = c("ensembl_gene_id", "entrezgene"), mart = mart)
> entrez$entrezgene = as.character(entrez$entrezgene)
>
> plotMA_full = function(res) {
+ ymax = max(res$log2FoldChange, na.rm = TRUE)
+ ymin = min(res$log2FoldChange, na.rm = TRUE)
+ plotMA(res, ylim = c(ymin, ymax))
+ }
> volcano = function(res) {
+ stats = as.data.frame(res[, c(2, 6)])
+ p = volcano_density_plot(stats, lfc.cutoff = 1.5)
+ print(p)
+ }
> markup_deseq = function(res, fname) {
+ out_df = as.data.frame(res)
+ out_df$id = rownames(out_df)
+ out_df = out_df[, c("id", colnames(out_df)[colnames(out_df) != "id"])]
+ out_df = out_df %>% left_join(symbols, by = c(id = "ensembl_gene_id")) %>%
+ arrange(padj)
+ return(out_df)
+ }
> write_deseq = function(res, fname) {
+ out_df = as.data.frame(res)
+ out_df$id = rownames(out_df)
+ out_df = out_df[, c("id", colnames(out_df)[colnames(out_df) != "id"])]
+ out_df = out_df %>% left_join(symbols, by = c(id = "ensembl_gene_id")) %>%
+ arrange(padj)
+ write.table(out_df, file = fname, quote = FALSE, sep = ",", row.names = FALSE,
+ col.names = TRUE)
+ }
> of_interest = c("Tgfb1", "Tgfb2", "Pdgfc", "Il6")
> ko = results(dds, name = "genotype_yaptaz_vs_wt")
> komarked = markup_deseq(ko)
> plotMA_full(ko)
> volcano(ko)
TableGrob (2 x 2) "arrange": 3 grobs
z cells name grob
1 1 (2-2,1-1) arrange gtable[arrange]
2 2 (2-2,2-2) arrange gtable[arrange]
3 3 (1-1,1-2) arrange text[GRID.text.1119]
> write_deseq(ko, "knockout-effect.csv")
There are 308 genes affected by the knockout.
> treatwt = results(dds, name = "treatment_ccl4_vs_untreated")
> treatwtmarked = markup_deseq(treatwt)
> plotMA_full(treatwt)
> volcano(treatwt)
TableGrob (2 x 2) "arrange": 3 grobs
z cells name grob
1 1 (2-2,1-1) arrange gtable[arrange]
2 2 (2-2,2-2) arrange gtable[arrange]
3 3 (1-1,1-2) arrange text[GRID.text.1286]
> write_deseq(treatwt, "treatment-effect-on-wildtype.csv")
There are 513 genes affected by the CCL4 treatment in the wildtype.
> treatko = results(dds, list(c("treatment_ccl4_vs_untreated", "genotypeyaptaz.treatmentccl4")))
> treatkomarked = markup_deseq(treatko)
> plotMA_full(treatko)
> volcano(treatko)
TableGrob (2 x 2) "arrange": 3 grobs
z cells name grob
1 1 (2-2,1-1) arrange gtable[arrange]
2 2 (2-2,2-2) arrange gtable[arrange]
3 3 (1-1,1-2) arrange text[GRID.text.1453]
> write_deseq(treatko, "treatment-effect-on-knockout.csv")
There are 3176 genes affected by the CCL4 treatment in the knockout.
> treatkospecific = results(dds, name = "genotypeyaptaz.treatmentccl4")
> plotMA_full(treatkospecific)
> volcano(treatkospecific)
TableGrob (2 x 2) "arrange": 3 grobs
z cells name grob
1 1 (2-2,1-1) arrange gtable[arrange]
2 2 (2-2,2-2) arrange gtable[arrange]
3 3 (1-1,1-2) arrange text[GRID.text.1620]
> treatkospecificmarked = markup_deseq(treatkospecific)
> write_deseq(treatkospecific, "specific-effect-on-knockout.csv")
There are 796 genes specifically affected differently by the CCL4 treatment in the knockout compared to the wildtype.
These tables have the format:
id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,mgi_symbol
> orgdb = "org.Mm.eg.db"
> biomart_dataset = "mmusculus_gene_ensembl"
> keggname = "mmu"
> library(dplyr)
> library(clusterProfiler)
> library(orgdb, character.only = TRUE)
> library(biomaRt)
> mart = biomaRt::useMart(biomart = "ensembl", dataset = biomart_dataset)
> entrez = biomaRt::getBM(attributes = c("ensembl_gene_id", "entrezgene"), mart = mart)
> entrez$entrezgene = as.character(entrez$entrezgene)
> entrezsymbol = biomaRt::getBM(attributes = c("mgi_symbol", "entrezgene"), mart = mart)
> entrezsymbol$entrezgene = as.character(entrezsymbol$entrezgene)
> summarize_cp = function(res, comparison) {
+ summaries = data.frame()
+ for (ont in names(res)) {
+ ontsum = summary(res[[ont]])
+ ontsum$ont = ont
+ summaries = rbind(summaries, ontsum)
+ }
+ summaries$comparison = comparison
+ return(summaries)
+ }
>
> enrich_cp = function(res, comparison) {
+ res = res %>% data.frame() %>% tibble::rownames_to_column() %>% left_join(entrez,
+ by = c(rowname = "ensembl_gene_id")) %>% filter(!is.na(entrezgene))
+ universe = res$entrezgene
+ genes = subset(res, padj < 0.05)$entrezgene
+ mf = enrichGO(genes, universe = universe, OrgDb = orgdb, ont = "MF", pAdjustMethod = "BH",
+ qvalueCutoff = 1, pvalueCutoff = 1)
+ cc = enrichGO(genes, universe = universe, OrgDb = orgdb, ont = "CC", pAdjustMethod = "BH",
+ qvalueCutoff = 1, pvalueCutoff = 1)
+ bp = enrichGO(genes, universe = universe, OrgDb = orgdb, ont = "BP", pAdjustMethod = "BH",
+ qvalueCutoff = 1, pvalueCutoff = 1)
+ kg = enrichKEGG(gene = genes, universe = universe, organism = "mmu", pvalueCutoff = 1,
+ qvalueCutoff = 1, pAdjustMethod = "BH")
+ all = list(mf = mf, cc = cc, bp = bp, kg = kg)
+ all[["summary"]] = summarize_cp(all, comparison)
+ return(all)
+ }
> gsea_cp = function(res, comparison) {
+ res = res %>% left_join(entrez, by = c(rowname = "ensembl_gene_id")) %>%
+ filter(!is.na(entrezgene)) %>% filter(!is.na(log2FoldChange)) %>% filter(!is.na(lfcSE))
+ lfc = data.frame(res)[, "log2FoldChange"]
+ lfcse = data.frame(res)[, "lfcSE"]
+ genes = lfc/lfcse
+ names(genes) = res$entrezgene
+ genes = genes[order(genes, decreasing = TRUE)]
+ cc = gseGO(genes, ont = "CC", OrgDb = orgdb, nPerm = 500, pvalueCutoff = 1,
+ pAdjustMethod = "BH", verbose = TRUE)
+ mf = gseGO(genes, ont = "MF", OrgDb = orgdb, nPerm = 500, pvalueCutoff = 1,
+ pAdjustMethod = "BH", verbose = TRUE)
+ bp = gseGO(genes, ont = "bp", OrgDb = orgdb, nPerm = 500, pvalueCutoff = 1,
+ pAdjustMethod = "BH", verbose = TRUE)
+ genes = data.frame(res)[, "log2FoldChange"]
+ names(genes) = res$entrezgene
+ genes = genes[order(genes, decreasing = TRUE)]
+ genes = genes[!is.na(genes)]
+ kg = gseKEGG(geneList = genes, organism = keggname, nPerm = 500, pvalueCutoff = 1,
+ verbose = TRUE)
+ if (orgdb == "org.Hs.eg.db") {
+ do = summary(gseDO(geneList = genes, nPerm = 500, pvalueCutoff = 1,
+ pAdjustMethod = "BH", verbose = TRUE))
+ do$ont = "DO"
+ all = list(mf = mf, cc = cc, bp = bp, kg = kg, do = do)
+ } else {
+ all = list(mf = mf, cc = cc, bp = bp, kg = kg)
+ }
+ all[["summary"]] = summarize_cp(all, comparison)
+ return(all)
+ }
>
> convert_core_ids = function(res) {
+ res = res %>% mutate(geneID = strsplit(as.character(geneID), "/")) %>% tidyr::unnest(geneID) %>%
+ left_join(entrezsymbol, by = c(geneID = "entrezgene")) %>% group_by(ID,
+ Description, GeneRatio, BgRatio, pvalue, p.adjust, qvalue, Count, ont,
+ comparison) %>% summarise(geneID = paste(geneID, collapse = "/"), symbol = paste(mgi_symbol,
+ collapse = "/"))
+ return(res)
+ }
> library(readr)
> ko_enrich = enrich_cp(ko, "knockout-effect")
> ko_out = convert_core_ids(subset(ko_enrich$summary, qvalue < 0.05))
> write_csv(ko_out, "knockout-effect-pathways.csv")
> treatwt_enrich = enrich_cp(treatwt, "treatment-effect-on-wt")
> treatwt_out = convert_core_ids(subset(treatwt_enrich$summary, qvalue < 0.05))
> write_csv(treatwt_out, "treatment-effect-on-wt-pathways.csv")
> treatko_enrich = enrich_cp(treatko, "treatment-effect-on-ko")
> treatko_out = convert_core_ids(subset(treatko_enrich$summary, qvalue < 0.05))
> write_csv(treatko_out, "treatment-effect-on-ko-pathways.csv")
> treatkospecific_enrich = enrich_cp(treatkospecific, "specific-treatment-effect-on-ko")
> treatkospecific_out = convert_core_ids(subset(treatkospecific_enrich$summary,
+ qvalue < 0.05))
> write_csv(treatkospecific_out, "specific-effect-on-ko-pathways.csv")
Treatment effect on wildtype, pathways
> tpm = txi.salmon$abundance %>% as.data.frame() %>% tibble::rownames_to_column() %>%
+ left_join(symbols, by = c(rowname = "ensembl_gene_id"))
> write_csv(tpm, "tpm.csv")
> summarydata$sample = rownames(summarydata)
> il1bvals = tpm %>% dplyr::filter(mgi_symbol == "Il1b") %>% tidyr::gather(sample,
+ tpm, -mgi_symbol, -rowname) %>% left_join(summarydata, by = "sample")
> ggplot(il1bvals, aes(sample, tpm)) + geom_point() + theme(axis.text.x = element_text(angle = 90,
+ hjust = 1))
> ggplot(subset(il1bvals, tpm < 50), aes(sample, tpm)) + geom_point() + theme(axis.text.x = element_text(angle = 90,
+ hjust = 1))
Dean posed this question:
Could you check if our Hippo gene signature is significantly changed in the
context of injury. Is it normalized after injury when Yap/Taz are knocked out? I
have included the a file called "Yap Targets.grp" which documents genes
typically changed in cell lines when Hippo Signaling is inactivated. Usually we
use the GSEA suite to examine the significance.
We could do GSEA, but it will make it tough to answer the question ‘is it normalized’. Here we will try to answer that question by measuring the distance between all samples via PCA, using those genes to query. The file we were sent is in human symbol form, though. So we have to convert that first:
> library(biomaRt)
> human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
> mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
> symbolconv = getLDS(attributes = c("mgi_symbol", "ensembl_gene_id"), mart = mouse,
+ attributesL = c("hgnc_symbol"), martL = human)
> colnames(symbolconv) = c("mouse", "id", "human")
> target_fn = "/Volumes/Clotho/Users/rory/cache/yimlamai-hepatocytes/metadata/yap-targets.grp"
> extras = data.frame(symbol = c("NOCT", "ASAP1", "COLGAT1", "IQCJ-SCHIP1", "AGFG2"))
> targets = read_csv(target_fn, skip = 1, col_names = c("symbol"))
> targets = rbind(targets, extras)
> targets = targets %>% left_join(symbolconv, by = c(symbol = "human"))
> genes = unique(targets$id)[!is.na(unique(targets$id))]
> genes = genes[genes %in% rownames(counts)]
Below at the PCA plots using those genes. This doesn’t look too good for the hypothesis that treating the Yap/Taz knockout samples normalizes these genes after injury. If this made the Yap/Taz cells more like the WT cells, we’d expect them to overlap. Instead we see the CCL4 treated YapTaz knockout cells are further away.
> dds = DESeqDataSetFromMatrix(countData = counts, colData = summarydata, design = ~Name)
> dds = estimateSizeFactors(dds)
> vst = varianceStabilizingTransformation(dds)
> pca_yap = function(object, genes) {
+ pca <- prcomp(t(assay(object)[genes, ]))
+ percentVar <- pca$sdev^2/sum(pca$sdev^2)
+ names(percentVar) = colnames(pca$x)
+ pca$percentVar = percentVar
+ return(pca)
+ }
> pc = pca_yap(vst, genes)
> comps = data.frame(pc$x)
> comps$Name = rownames(comps)
> library(dplyr)
> comps = comps %>% left_join(summarydata, by = c(Name = "Name"))
> colorby = "Name"
> pca_plot(comps, 1, 2, "gt")
Heirarchical clustering also doesn’t bear out this hypothesis, using the normalized counts:
> ncounts = log(counts(dds, normalized = TRUE) + 1)
> toheatmap = ncounts[genes, ] %>% as.data.frame() %>% tibble::rownames_to_column() %>%
+ left_join(targets, by = c(rowname = "id")) %>% na.omit()
> toplot = data.frame(toheatmap, check.names = FALSE)
> rownames(toplot) = toplot$mouse
> toplot = toplot[, !colnames(toplot) %in% c("symbol", "mouse", "id", "rowname")]
> heatmap_fn(toplot, fontsize = 7)
This is the same as above, but using the Piccolo lab genes:
> library(biomaRt)
> human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
> mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
> symbolconv = getLDS(attributes = c("mgi_symbol", "ensembl_gene_id"), mart = mouse,
+ attributesL = c("hgnc_symbol"), martL = human)
> colnames(symbolconv) = c("mouse", "id", "human")
> target_fn = "/Volumes/Clotho/Users/rory/cache/yimlamai-hepatocytes/metadata/piccolo.txt"
> targets = read_csv(target_fn, col_names = c("symbol"))
> targets = targets %>% left_join(symbolconv, by = c(symbol = "human"))
> genes = unique(targets$id)[!is.na(unique(targets$id))]
> genes = genes[genes %in% rownames(counts)]
We sees a similar pattern as the above, CCL4 treatment make the YapTaz cells more different than the untreated cells, not less.
> dds = DESeqDataSetFromMatrix(countData = counts, colData = summarydata, design = ~Name)
> dds = estimateSizeFactors(dds)
> vst = varianceStabilizingTransformation(dds)
> pca_yap = function(object, genes) {
+ pca <- prcomp(t(assay(object)[genes, ]))
+ percentVar <- pca$sdev^2/sum(pca$sdev^2)
+ names(percentVar) = colnames(pca$x)
+ pca$percentVar = percentVar
+ return(pca)
+ }
> pc = pca_yap(vst, genes)
> comps = data.frame(pc$x)
> comps$Name = rownames(comps)
> library(dplyr)
> comps = comps %>% left_join(summarydata, by = c(Name = "Name"))
> colorby = "Name"
> pca_plot(comps, 1, 2, "gt")
Heirarchical clustering also doesn’t bear out this hypothesis.
> ncounts = log(counts(dds, normalized = TRUE) + 1)
> toheatmap = ncounts[genes, ] %>% as.data.frame() %>% tibble::rownames_to_column() %>%
+ left_join(targets, by = c(rowname = "id")) %>% na.omit()
> toplot = data.frame(toheatmap, check.names = FALSE)
> rownames(toplot) = toplot$mouse
> toplot = toplot[, !colnames(toplot) %in% c("symbol", "mouse", "id", "rowname")]
> heatmap_fn(toplot, fontsize = 7)